# import libraries
import numpy as np
import pandas as pd
import re
import os
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.utils import shuffle
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.preprocessing import LabelEncoder
from patsy import dmatrices
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from scipy.stats import entropy
import statsmodels.formula.api as sm
import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
import itertools
import inspect
import datetime
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import chi2
from scipy.stats import entropy
import datetime
import plotly.graph_objects as go
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from dateutil.relativedelta import relativedelta
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import model_selection,ensemble,metrics
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
import joblib
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from subprocess import check_output
from sklearn import model_selection
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
from scipy.stats import chi2_contingency
import seaborn as sns
from sklearn import preprocessing
%matplotlib inline
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import pandas as pd
from sklearn.linear_model import LogisticRegression
from statsmodels.tools import add_constant
import statsmodels.api as sm
#importing data
data1 = pd.read_excel(r"C:\Users\40102404\Downloads\BE\model_validation.xlsx")
# Separate the features and target variable (customer id is not a predictor)
data = data1.drop('customer', axis=1)
X = data.drop('default', axis=1)
y = data['default']
X = add_constant(X)
# Create the logistic regression model
logit_model = sm.Logit(y, X)
# Fit the model on the data (maximum likelihood)
result = logit_model.fit()
# Get the coefficients and intercept.
# result.params is a labelled Series; use .iloc for positional access —
# plain integer indexing (params[0] / params[1:]) is deprecated there.
coefficients = result.params.iloc[1:]
intercept = result.params.iloc[0]
# Create a dataframe with the coefficients and intercept.
# DataFrame.append was deprecated (see the FutureWarning in the original run)
# and removed in pandas >= 2.0; build the intercept row and concat instead.
coefficients_df = pd.DataFrame({'Variable': X.columns[1:], 'Coefficient': coefficients})
intercept_row = pd.DataFrame({'Variable': ['Intercept'], 'Coefficient': [intercept]})
coefficients_df = pd.concat([coefficients_df, intercept_row], ignore_index=True)
# Add the in-sample probability of default to the dataframe
data['Prob'] = result.predict(X)
print(coefficients_df)
print(data)
Optimization terminated successfully.
Current function value: 0.454353
Iterations 8
Variable Coefficient
0 age -0.015311
1 ed -0.028474
2 employ -0.225346
3 address -0.038665
4 income -0.002756
5 debtinc 0.076747
6 creddebt 0.557631
7 othdebt 0.026705
8 Intercept -0.414920
age ed employ address income debtinc creddebt othdebt default Prob
0 28 2 7 2 44 17.7 2.99 4.80 0 0.617119
1 64 5 34 17 116 14.7 5.05 12.00 0 0.002701
2 40 1 20 12 61 4.8 1.04 1.89 0 0.005508
3 30 1 11 3 27 34.5 1.75 7.56 0 0.563058
4 25 1 2 2 30 22.4 0.76 5.96 1 0.703751
.. ... .. ... ... ... ... ... ... ... ...
895 23 1 0 2 20 8.4 0.24 1.44 1 0.472271
896 28 3 3 3 25 18.8 1.16 3.54 1 0.597306
897 26 3 0 3 31 3.1 0.10 0.86 1 0.313629
898 36 5 3 7 45 3.3 0.47 1.02 0 0.162929
899 32 4 2 2 55 10.6 1.43 4.40 1 0.507518
[900 rows x 10 columns]
C:\Users\40102404\AppData\Local\Temp\ipykernel_10204\3736388604.py:22: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
coefficients_df = coefficients_df.append({'Variable': 'Intercept', 'Coefficient': intercept}, ignore_index=True)
# Hard classification at the 0.4 probability cut-off
data['prob_default'] = (data['Prob'] > .4).astype(int)
# Scorecard scaling constants
factor = 28.8539
offset = 487.123
# Convert PD to a score on the log-odds scale (higher score = lower risk)
odds_of_non_default = (1 - data['Prob']) / data['Prob']
data['score'] = factor * np.log(odds_of_non_default) + offset
data['score']
0 473.349942
1 657.685614
2 637.048893
3 479.806212
4 462.157872
...
895 490.326679
896 475.747291
897 509.721537
898 534.345162
899 486.255272
Name: score, Length: 900, dtype: float64
#assigning grades
# Map predicted PD into 13 rating grades using fixed PD cut-offs.
# np.select returns the value for the first condition that matches, which is
# exactly the behaviour of the original 13-level nest of np.where calls
# (grade 1 keeps its inclusive <= .0017 bound; grades 2-12 use strict '<').
_grade_upper_bounds = [.0029, .0050, .0088, .0155, .0272, .0476, .0833, .1458, .2552, .4466, .7686]
_grade_conditions = [data['Prob'] <= .0017] + [data['Prob'] < b for b in _grade_upper_bounds]
data['grade'] = np.select(_grade_conditions, list(range(1, 13)), default=13)
#AUC
# Rank-ordering power of the predicted PD against the realised outcome
AUROC = roc_auc_score(data['default'], data['Prob'])
print(f" Area under ROC : {AUROC}")
Area under ROC : 0.8429353930341373
# Precision / recall / F1 at the 0.4 cut-off used to build prob_default
print(classification_report(data['default'],data['prob_default']))
precision recall f1-score support
0 0.82 0.76 0.79 571
1 0.63 0.70 0.66 329
accuracy 0.74 900
macro avg 0.72 0.73 0.73 900
weighted avg 0.75 0.74 0.74 900
#Gini Index
# Gini (accuracy ratio) is a linear rescaling of the AUC: 2*AUC - 1
Gini = 2 * AUROC - 1
print(f"Gini Index : {Gini}")
Gini Index : 0.6858707860682747
#Confusion Matrix
fig, ax = plt.subplots(figsize=(8,6), dpi=100)
conf_matrix = confusion_matrix(data['default'],data['prob_default'])
# Renamed from `display`: that name shadowed IPython.display.display
# imported at the top of the notebook.
cm_display = ConfusionMatrixDisplay(conf_matrix)
ax.set(title='Confusion Matrix for data')
cm_display.plot(ax=ax);
# Calculate Somers' D
# NOTE(review): this is a simplified, threshold-based measure — it only counts
# events vs non-events flagged above the 0.4 cut-off, not all concordant /
# discordant pairs as the textbook Somers' D does. Confirm this definition is
# intended before relying on the value.
# Also note this adds a 'probability' column onto the design matrix X itself.
X['probability'] = result.predict(X)
concordant_pairs = ((y == 1) & (X['probability'] > 0.4)).sum()
discordant_pairs = ((y == 0) & (X['probability'] > 0.4)).sum()
total_pairs = concordant_pairs + discordant_pairs
somers_d = (concordant_pairs - discordant_pairs) / total_pairs
print("\nSomers' D:")
print(somers_d)
Somers' D: 0.25885558583106266
# Calculate the chi-square statistic for the decile
# Rebuild the design matrix from the raw data so it matches the fitted model's columns
data2 = data1.drop('customer', axis=1)
X = data2.drop('default', axis=1)
y = data2['default']
X = add_constant(X)
# Calculate the probability values of default and bucket them into deciles.
# (Removed the unused X_without_intercept local from the original.)
probabilities = result.predict(X)
deciles = pd.qcut(probabilities, q=10, labels=False)
decile_counts = pd.crosstab(deciles, y)
# Column-wise (count - column mean)^2 / column mean, summed over all cells.
# NOTE(review): this compares each decile's count to the column average rather
# than to model-expected counts — confirm this is the intended statistic.
chi_square = decile_counts.apply(lambda x: (x - x.mean()) ** 2 / x.mean()).sum().sum()
print(chi_square)
306.9435055014665
#Cumulative Information Efficiency Ratio (CIER)
def calculate_cier(y_true, y_pred):
    """Ratio of summed running means of sorted outcomes to sorted predictions.

    Both series are ordered by ascending predicted probability; the running
    mean of the true labels is summed over all prefix lengths and divided by
    the same quantity computed on the predictions.
    """
    order = np.argsort(y_pred)
    true_sorted = np.asarray(y_true)[order]
    pred_sorted = np.asarray(y_pred)[order]
    positions = np.arange(1, len(y_true) + 1)
    running_mean_true = np.cumsum(true_sorted) / positions
    running_mean_pred = np.cumsum(pred_sorted) / positions
    return np.sum(running_mean_true) / np.sum(running_mean_pred)
# CIER on the development sample: actual defaults vs predicted PD
cier = calculate_cier(data['default'],data['Prob'])
# Binomial normal test
def perform_binomial_normal_test(y_true, y_pred):
    """Two-sided z-test p-value comparing the mean predicted PD to the observed default rate.

    The standard error is based on the observed rate, so the test asks whether
    the average predicted probability is consistent with the realised frequency.
    """
    sample_size = len(y_true)
    observed_rate = sum(y_true) / sample_size
    predicted_rate = sum(y_pred) / sample_size
    standard_error = np.sqrt(observed_rate * (1 - observed_rate) / sample_size)
    z_value = (predicted_rate - observed_rate) / standard_error
    # Two-sided p-value under the standard normal
    return 2 * (1 - stats.norm.cdf(np.abs(z_value)))
# Calibration z-test p-value on the development sample
binomial_test = perform_binomial_normal_test(data['default'],data['Prob'])
#hosmer lemeshow test
def perform_hosmer_lemeshow_test(y_true, y_pred, num_bins=10):
    """Hosmer-Lemeshow goodness-of-fit p-value for predicted probabilities.

    Observations are bucketed into `num_bins` quantile bins of the predicted
    probability; observed vs expected event / non-event counts per bin give a
    chi-square statistic with num_bins - 2 degrees of freedom.

    Returns a scalar p-value in [0, 1].
    """
    df = pd.DataFrame({'ground_truth': y_true, 'predictions': y_pred})
    df['decile'] = pd.qcut(df['predictions'], num_bins, labels=False)
    # BUG FIX: the original used groupby(..., as_index=False), which keeps the
    # 'decile' column inside every aggregate (DataFrames instead of Series),
    # turning the statistic into an array — the notebook printed "[1. 1. 1.]".
    # Grouping with the bin as the index yields per-bin Series and a scalar.
    grouped = df.groupby('decile')
    observed_events = grouped['ground_truth'].sum()
    observed_nonevents = grouped['ground_truth'].count() - observed_events
    expected_events = grouped['predictions'].sum()
    expected_nonevents = grouped['predictions'].count() - expected_events
    chi_square = (((observed_events - expected_events) ** 2) / expected_events).sum() + \
                 (((observed_nonevents - expected_nonevents) ** 2) / expected_nonevents).sum()
    degrees_of_freedom = num_bins - 2
    lm_test = 1 - chi2.cdf(chi_square, degrees_of_freedom)
    return lm_test
# Hosmer-Lemeshow p-value on the development sample
lm_test = perform_hosmer_lemeshow_test(data['default'],data['Prob'] )
print('lm_test:', lm_test)
lm_test: [1. 1. 1.]
# CAP (Cumulative Accuracy Profile)
total = len(data['default'])
class_1_count = np.sum(data['default'])
class_0_count = total - class_1_count
plt.figure(figsize=(20, 12))
# Baseline: a random model captures defaults proportionally to sample size
plt.plot([0, total], [0, class_1_count], c='r', linestyle='--', label='Random Model')
# Ideal: a perfect model captures every default first
plt.plot([0, class_1_count, total],
         [0, class_1_count, class_1_count],
         c='grey', linewidth=2, label='Perfect Model')
# Trained model: cumulative defaults captured when sorted by descending PD
probs = data['Prob']
model_y = [label for _, label in sorted(zip(probs, data['default']), reverse=True)]
y_values = np.append([0], np.cumsum(model_y))
x_values = np.arange(0, total + 1)
plt.plot(x_values, y_values, c='b', label='Credit risk model', linewidth=4)
# Plot information
plt.xlabel('Total observations', fontsize=7)
plt.ylabel('Class 1 observations', fontsize=7)
plt.title('Cumulative Accuracy Profile', fontsize=7)
plt.legend(loc='lower right', fontsize=7)
<matplotlib.legend.Legend at 0x227736aee80>
# Kendalltau
# Rank correlation between the actual default flag and the predicted PD
res = stats.kendalltau(data['default'], data['Prob'])
# Kendall's tau coefficient (notebook cell output)
res.correlation
0.46738223959132313
#Brier score: mean squared error between outcome and predicted PD (lower is better)
brier_score_loss = metrics.brier_score_loss(data['default'], data['Prob'])
print(f"Brier Score Loss : {brier_score_loss:.4f}")
Brier Score Loss : 0.1527
#Entropy and KL divergence
def calculate_unconditional_entropy(y_true):
    """Shannon entropy of the outcome values histogrammed into 10 bins on [0, 1]."""
    density, _ = np.histogram(y_true, bins=10, range=(0, 1), density=True)
    return entropy(density)
def calculate_conditional_entropy(y_true, y_pred):
    """KL divergence of the outcome histogram relative to the prediction histogram.

    Both inputs are binned into 10 equal-width bins on [0, 1]; scipy's
    entropy(pk, qk) normalises both histograms and computes sum(pk*log(pk/qk)).
    """
    true_density, _ = np.histogram(y_true, bins=10, range=(0, 1), density=True)
    pred_density, _ = np.histogram(y_pred, bins=10, range=(0, 1), density=True)
    return entropy(true_density, pred_density)
# Entropy of the actual outcomes and relative entropy of outcomes vs predictions.
# NOTE(review): the difference below is a nonstandard "KL divergence" — the
# conditional_entropy value is itself a KL divergence; confirm the intended metric.
unconditional_entropy = calculate_unconditional_entropy(data['default'])
conditional_entropy = calculate_conditional_entropy(data['default'],data['Prob'])
KL_divergence=conditional_entropy-unconditional_entropy
# KS statistics
def k_s_statistics_gain_lift(data, predicted_probability, ground_truth, response_name='default'):
    """Decile table with KS statistic, gain and lift for a scored dataset.

    Rows are the 10 quantile bins of the predicted probability ordered from the
    highest-probability decile down; the KS statistic is the running gap between
    the cumulative event and non-event capture rates, and the best decile is
    flagged with '*****'.
    """
    ranked = data.sort_values(by=predicted_probability, ascending=False)
    ranked['decile_group'] = pd.qcut(ranked[predicted_probability], q=10)
    table = ranked.groupby('decile_group').agg(['count', 'sum'])[ground_truth].sort_index(ascending=False)

    event_col = f'Number of {response_name}'
    nonevent_col = f'Number of Non-{response_name}'
    table.columns = ['Total count', event_col]
    table[nonevent_col] = table['Total count'] - table[event_col]
    table[f'{response_name}_Rate%'] = (table[event_col] / table['Total count']).apply(lambda r: round(100 * r, 2))
    table[f'Percent of {response_name}%'] = (table[event_col] / table[event_col].sum()).apply(lambda r: round(100 * r, 2))
    table[f'Percent of Non-{response_name}%'] = (table[nonevent_col] / table[nonevent_col].sum()).apply(lambda r: round(100 * r, 2))
    # Cumulative shares of events and non-events, top decile first
    cum_event_share = (table[event_col] / table[event_col].sum()).cumsum()
    cum_nonevent_share = (table[nonevent_col] / table[nonevent_col].sum()).cumsum()
    table['ks_stats'] = np.round(cum_event_share - cum_nonevent_share, 4) * 100
    table['max_ks'] = table['ks_stats'].apply(lambda v: '*****' if v == table['ks_stats'].max() else '')
    table['Gain'] = table[f'Percent of {response_name}%'].cumsum()
    table['Lift'] = (table['Gain'] / np.arange(10, 110, 10)).apply(lambda v: round(v, 2))
    return table
#KS statistics table
logistic_ks_data=k_s_statistics_gain_lift(data=data,predicted_probability='Prob',ground_truth='default')
# Display the decile table (notebook cell output)
logistic_ks_data
| Total count | Number of default | Number of Non-default | default_Rate% | Percent of default% | Percent of Non-default% | ks_stats | max_ks | Gain | Lift | |
|---|---|---|---|---|---|---|---|---|---|---|
| decile_group | ||||||||||
| (0.795, 1.0] | 90 | 84 | 6 | 93.33 | 25.53 | 1.05 | 24.48 | 25.53 | 2.55 | |
| (0.626, 0.795] | 90 | 60 | 30 | 66.67 | 18.24 | 5.25 | 37.46 | 43.77 | 2.19 | |
| (0.5, 0.626] | 90 | 48 | 42 | 53.33 | 14.59 | 7.36 | 44.70 | 58.36 | 1.95 | |
| (0.408, 0.5] | 90 | 37 | 53 | 41.11 | 11.25 | 9.28 | 46.66 | 69.61 | 1.74 | |
| (0.324, 0.408] | 90 | 43 | 47 | 47.78 | 13.07 | 8.23 | 51.50 | ***** | 82.68 | 1.65 |
| (0.242, 0.324] | 90 | 24 | 66 | 26.67 | 7.29 | 11.56 | 47.24 | 89.97 | 1.50 | |
| (0.169, 0.242] | 90 | 18 | 72 | 20.00 | 5.47 | 12.61 | 40.10 | 95.44 | 1.36 | |
| (0.0784, 0.169] | 90 | 12 | 78 | 13.33 | 3.65 | 13.66 | 30.09 | 99.09 | 1.24 | |
| (0.0142, 0.0784] | 90 | 3 | 87 | 3.33 | 0.91 | 15.24 | 15.76 | 100.00 | 1.11 | |
| (-0.00099818, 0.0142] | 90 | 0 | 90 | 0.00 | 0.00 | 15.76 | -0.00 | 100.00 | 1.00 |
#plotting KS statistics
def plot_cdfs_credit_risk(data, predicted_probability, ground_truth, response_name='default'):
    """Plot per-class empirical CDFs of the predicted probability and return the KS statistic.

    The KS statistic is the maximum vertical gap between the empirical CDFs of
    the predicted probabilities for the event and non-event groups.
    """
    # Split predicted probabilities by outcome and sort each group
    event_scores = np.sort(data.loc[data[ground_truth] == 1, predicted_probability].to_numpy())
    non_event_scores = np.sort(data.loc[data[ground_truth] == 0, predicted_probability].to_numpy())
    # BUG FIX: the original subtracted two pandas Series whose row labels were
    # disjoint (event rows vs non-event rows); index alignment made every
    # element NaN, so the returned "KS statistic" was always NaN. The plotted
    # curves were also cumulative sums of the probabilities, not CDFs.
    # Evaluate proper empirical CDFs of both groups on a common score grid.
    grid = np.union1d(event_scores, non_event_scores)
    event_cdf = np.searchsorted(event_scores, grid, side='right') / len(event_scores)
    non_event_cdf = np.searchsorted(non_event_scores, grid, side='right') / len(non_event_scores)
    # Plot the CDFs
    plt.plot(grid, event_cdf, label=response_name)
    plt.plot(grid, non_event_cdf, label='Non-' + response_name)
    plt.xlabel('Predicted Probability')
    plt.ylabel('Cumulative Probability')
    plt.title('CDFs in Credit Risk')
    plt.legend()
    plt.grid(True)
    plt.show()
    # Largest vertical gap between the two CDFs
    ks_statistic = np.max(np.abs(event_cdf - non_event_cdf))
    return ks_statistic
ks_statistic = plot_cdfs_credit_risk(data=data,predicted_probability='Prob',ground_truth='default')
#gain Chart
def model_selection_by_gain_chart(model_gains_dict):
    """Plot cumulative gain curves for one or more models against the random baseline.

    model_gains_dict maps a model name to its list of 10 per-decile cumulative
    gain percentages.
    """
    fig = go.Figure()
    axis_points = list(range(0, 100 + 10, 10))
    fig.add_trace(go.Scatter(x=axis_points, y=axis_points,
                             mode='lines+markers', name='Random Model'))
    for model_name, model_gains in model_gains_dict.items():
        # BUG FIX: the original called model_gains.insert(0, 0), mutating the
        # caller's list in place (repeated calls kept prepending zeros).
        # Build a fresh list with the origin point instead.
        curve = [0] + list(model_gains)
        fig.add_trace(go.Scatter(x=axis_points, y=curve,
                                 mode='lines+markers', name=model_name))
    fig.update_xaxes(title_text="% of Data Set")
    fig.update_yaxes(title_text="% of Gain")
    fig.update_layout(title='Gain Charts')
    fig.show()
model_selection_by_gain_chart(model_gains_dict={'Credit risk model':logistic_ks_data.Gain.to_list()})
# lift chart
def model_selection_by_lift_chart(model_lift_dict):
    """Plot per-decile lift curves for one or more models against the flat random baseline."""
    fig = go.Figure()
    decile_axis = list(range(10, 100 + 10, 10))
    # A random model has lift 1.0 in every decile
    fig.add_trace(go.Scatter(x=decile_axis, y=np.repeat(1, 10),
                             mode='lines+markers', name='Random Lift'))
    for model_name, model_lifts in model_lift_dict.items():
        fig.add_trace(go.Scatter(x=decile_axis, y=model_lifts,
                                 mode='lines+markers', name=model_name))
    fig.update_xaxes(title_text="% of Data Set")
    fig.update_yaxes(title_text="Lift")
    fig.update_layout(title='Lift Charts')
    fig.show()
model_selection_by_lift_chart(model_lift_dict={'Credit Risk Model':logistic_ks_data.Lift.to_list()})
# Cohen's d statistic
def calculate_cohens_d(y_true, y_pred):
    """Standardised mean difference between actual outcomes and predicted probabilities."""
    pooled_spread = np.sqrt((np.var(y_true) + np.var(y_pred)) / 2.0)
    return (np.mean(y_true) - np.mean(y_pred)) / pooled_spread
# Effect size between observed defaults and predicted PDs.
# The run printed 0.0 — consistent with the mean fitted probability matching
# the observed default rate for this model (TODO confirm this is expected).
cohens_d = calculate_cohens_d(data['default'],data['Prob'])
print("Cohen's d:", cohens_d)
Cohen's d: 0.0
#importing new datasets
# Monitoring sample used for the population stability index (PSI) check
psi=pd.read_excel(r"C:\Users\40102404\Downloads\BE\PSI.xlsx")
# Display the raw monitoring data (notebook cell output)
psi
| customer | age | ed | employ | address | income | debtinc | creddebt | othdebt | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 360036 | 24 | 4 | 0 | 1 | 56 | 9.3 | 1.55 | 3.66 |
| 1 | 360188 | 19 | 2 | 0 | 0 | 55 | 7.9 | 0.61 | 3.73 |
| 2 | 360230 | 28 | 4 | 2 | 3 | 22 | 6.2 | 0.76 | 0.61 |
| 3 | 360245 | 40 | 5 | 1 | 11 | 32 | 18.6 | 3.40 | 2.55 |
| 4 | 360349 | 33 | 5 | 6 | 4 | 121 | 15.1 | 3.31 | 14.96 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 595 | 453471 | 34 | 3 | 8 | 4 | 83 | 11.0 | 1.85 | 7.28 |
| 596 | 453578 | 37 | 2 | 10 | 8 | 43 | 3.6 | 0.81 | 0.74 |
| 597 | 453686 | 25 | 5 | 0 | 3 | 16 | 3.2 | 0.29 | 0.22 |
| 598 | 453698 | 34 | 1 | 10 | 8 | 41 | 14.5 | 1.19 | 4.75 |
| 599 | 453777 | 27 | 1 | 2 | 2 | 24 | 5.8 | 0.56 | 0.84 |
600 rows × 9 columns
#calculating score and probability of the new dataset with the fitted model
psi = psi.drop('customer', axis=1)
psi = add_constant(psi)
psi['Prob'] = result.predict(psi)
psi['score'] = factor * np.log((1 - psi['Prob']) / psi['Prob']) + offset
#assigning grade to new dataset, using the same PD cut-offs as the development sample.
# np.select returns the value for the first matching condition — identical to
# the original 13-level nest of np.where calls (grade 1 keeps '<=', 2-12 use '<').
_psi_upper_bounds = [.0029, .0050, .0088, .0155, .0272, .0476, .0833, .1458, .2552, .4466, .7686]
_psi_conditions = [psi['Prob'] <= .0017] + [psi['Prob'] < b for b in _psi_upper_bounds]
psi['grade'] = np.select(_psi_conditions, list(range(1, 13)), default=13)
psi
| const | age | ed | employ | address | income | debtinc | creddebt | othdebt | Prob | score | grade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 24 | 4 | 0 | 1 | 56 | 9.3 | 1.55 | 3.66 | 0.642572 | 470.198938 | 12 |
| 1 | 1.0 | 19 | 2 | 0 | 0 | 55 | 7.9 | 0.61 | 3.73 | 0.532882 | 483.322444 | 12 |
| 2 | 1.0 | 28 | 4 | 2 | 3 | 22 | 6.2 | 0.76 | 0.61 | 0.338740 | 506.423802 | 11 |
| 3 | 1.0 | 40 | 5 | 1 | 11 | 32 | 18.6 | 3.40 | 2.55 | 0.815019 | 444.333841 | 13 |
| 4 | 1.0 | 33 | 5 | 6 | 4 | 121 | 15.1 | 3.31 | 14.96 | 0.622788 | 472.655720 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 595 | 1.0 | 34 | 3 | 8 | 4 | 83 | 11.0 | 1.85 | 7.28 | 0.242903 | 519.924958 | 10 |
| 596 | 1.0 | 37 | 2 | 10 | 8 | 43 | 3.6 | 0.81 | 0.74 | 0.048711 | 572.874377 | 8 |
| 597 | 1.0 | 25 | 5 | 0 | 3 | 16 | 3.2 | 0.29 | 0.22 | 0.334707 | 506.944803 | 11 |
| 598 | 1.0 | 34 | 1 | 10 | 8 | 41 | 14.5 | 1.19 | 4.75 | 0.149761 | 537.227016 | 10 |
| 599 | 1.0 | 27 | 1 | 2 | 2 | 24 | 5.8 | 0.56 | 0.84 | 0.338248 | 506.487185 | 11 |
600 rows × 12 columns
#calculation PSI values across different grade
# Share of observations per grade in the development vs the monitoring sample
dev = data['grade'].value_counts(normalize=True)
new = psi['grade'].value_counts(normalize=True)
df = pd.concat([dev, new], axis=1, keys=['dev', 'new'])
df = df.sort_index()
# Per-grade PSI contribution: (dev% - new%) * ln(dev% / new%).
# NOTE(review): if a grade appears in only one sample, concat's outer join
# leaves NaN shares and the contribution (and total) becomes NaN — confirm
# both samples cover all grades before relying on the sum.
df['PSI'] = (df['dev']-df['new']) * np.log(df['dev'] / df['new'])
df['PSI']
1 0.000153 2 0.011297 3 0.000927 4 0.002839 5 0.000569 6 0.001808 7 0.000194 8 0.000567 9 0.000004 10 0.000080 11 0.001446 12 0.001425 13 0.006545 Name: PSI, dtype: float64
#total PSI value
# Sum of the per-grade contributions computed above (notebook cell output)
np.sum(df['PSI'])
0.027854160655260505